* Run this do-file with the working directory set to the project root:
* every path below is relative to that directory.
* Paths use forward slashes, which Stata accepts on Windows, macOS and Linux.

set more off

* Load the origin-destination migration probabilities.
* (The clear option already discards any data in memory.)
use dsets/migration/migration_cz_origin_destination_prob_cpp_onlyyoung.dta, clear

* Gravity equation for migration, estimated by Poisson pseudo-maximum likelihood:
*   - regressors: same_cz and distance
*   - absorbed fixed effects: cz_origin and cz_destination
*   - standard errors: two-way clustered on cz_destination and cz_origin
* ppmlhdfe is a community-contributed command; if it is missing, install it
* with "ssc install ppmlhdfe" (it also requires reghdfe and ftools).
ppmlhdfe prob_dest_given_origin same_cz distance, absorb(cz_origin cz_destination) cluster(cz_destination cz_origin)

*** Appendix Table E.5:
/*
HDFE PPML regression                              No. of obs      =    235,225
Absorbing 2 HDFE groups                           Residual df     =        484
Statistics robust to heteroskedasticity           Wald chi2(2)    =     694.59
Deviance             =  839.1903767               Prob > chi2     =     0.0000
Log pseudolikelihood = -2500.490551               Pseudo R2       =     0.2268

Number of clusters (cz_destination)=       485
Number of clusters (cz_origin)=       485
             (Std. Err. adjusted for 485 clusters in cz_destination cz_origin)
------------------------------------------------------------------------------
             |               Robust
prob_dest_~n |      Coef.   Std. Err.      z    P>|z|     [95% Conf. Interval]
-------------+----------------------------------------------------------------
     same_cz |   .7888229   .1203616     6.55   0.000     .5529185    1.024727
    distance |  -.0028124   .0001857   -15.15   0.000    -.0031763   -.0024484
       _cons |  -3.124194    .107835   -28.97   0.000    -3.335546   -2.912841
------------------------------------------------------------------------------
*/

******************************************************
* Now compute the probability of origin given destination.
* It will be used to augment the gravity equation for knowledge flows.
*
* Input : dsets/temp_datasets_todelete/temp150.dta, which must contain
*         cz_origin, cz_destination and the weight perwt_adj2.
* Output: dsets/migration/migration_cz_origin_given_dest_cpp_onlyyoung.dta

use dsets/temp_datasets_todelete/temp150.dta, clear

* Weight totals: by destination, and by origin-destination pair.
* total() is the documented name of the egen function formerly called sum();
* like sum(), it treats missing weights as zero.
bysort cz_destination: egen totweight_cz_dest = total(perwt_adj2)
bysort cz_origin cz_destination: egen totweight_cz_or_dest = total(perwt_adj2)

* Share of the destination's total weight that comes from each origin.
gen prob_origin_given_dest = totweight_cz_or_dest / totweight_cz_dest

keep cz_origin cz_destination prob_origin_given_dest

* Keep one observation per origin-destination pair. The probability is
* constant within a pair, so which observation survives is irrelevant.
* This replaces "duplicates drop cz_origin cz_destination, force", which took
* a long time; the data are already sorted by the pair at this point.
bysort cz_origin cz_destination: keep if _n == 1

* Add the pairs listed in the origin-destination file; pairs that appear only
* there have no observed flow and get probability zero.
* NOTE(review): pairs present only in the migration data are also kept by this
* merge -- confirm that this is intended.
merge 1:1 cz_origin cz_destination using dsets/cz_data/cz_origin_destination_cpp.dta, nogen
replace prob_origin_given_dest = 0 if missing(prob_origin_given_dest)

save dsets/migration/migration_cz_origin_given_dest_cpp_onlyyoung.dta, replace

*** Create the dataset with bilateral distances for matlab:
* Builds a wide table with one row per origin CZ and one distance column per
* destination CZ, then exports it as a CSV without a header row.

use dsets/migration/cz_bilateral_distance.dta, clear

* Keep only pairs whose origin AND destination both appear in the list of CZs
* to consider. The list is keyed on the variable cz, so each side is renamed
* to cz for its merge and renamed back afterwards.
foreach side in origin destination {
    rename cz_`side' cz
    merge m:1 cz using dsets/cz_data/cz_to_consider_cpp.dta, keep(match) nogen
    rename cz cz_`side'
}

* One row per origin, one distance<cz_destination> column per destination.
* NOTE(review): any extra variables carried in from cz_to_consider_cpp.dta
* would survive the reshape and end up in the CSV -- confirm that file only
* contributes the key variable cz.
reshape wide distance, i(cz_origin) j(cz_destination)

* The row identifier is dropped and no header is written, so row position is
* the only link back to the origin CZ: make the ordering explicit first.
sort cz_origin
drop cz_origin

export delimited using model/dsets_for_model/cz_bilateral_distance_wide_formatlab_cpp.csv, replace novarnames




